/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "./aom_scale_rtcd.h"
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"

#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"

#include "av1/common/blockd.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
#if CONFIG_MOTION_VAR
#include "av1/common/onyxc_int.h"
#endif  // CONFIG_MOTION_VAR

#if CONFIG_EXT_INTER

#define NSMOOTHERS 1
#define USE_SOFT_WEIGHTS_IN_WEDGE 1
static int get_masked_weight(int m, int smoothness) {
#define SMOOTHER_LEN 32
  static const uint8_t smoothfn[NSMOOTHERS][2 * SMOOTHER_LEN + 1] = { {
#if USE_SOFT_WEIGHTS_IN_WEDGE
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  4,  7,  13, 21, 32, 43,
      51, 57, 60, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
#else
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  64, 64, 32, 64,
      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
#endif  // USE_SOFT_WEIGHTS_IN_WEDGE
  } };
  if (m < -SMOOTHER_LEN)
    return 0;
  else if (m > SMOOTHER_LEN)
    return (1 << WEDGE_WEIGHT_BITS);
  else
    return smoothfn[smoothness][m + SMOOTHER_LEN];
}

// [smoother][negative][direction]
DECLARE_ALIGNED(16, static uint8_t,
                wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS]
                              [MASK_MASTER_SIZE * MASK_MASTER_SIZE]);

DECLARE_ALIGNED(16, static uint8_t,
                wedge_signflip_lookup[BLOCK_SIZES][MAX_WEDGE_TYPES]);

// 3 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
// on the sum of all mask sizes up to an including MAX_WEDGE_SQUARE.
DECLARE_ALIGNED(16, static uint8_t,
                wedge_mask_buf[2 * MAX_WEDGE_TYPES * 3 * MAX_WEDGE_SQUARE]);

static wedge_masks_type wedge_masks[BLOCK_SIZES][2];

// Some unused wedge codebooks left temporarily to facilitate experiments.
// To be removed when settled.
/*
static wedge_code_type wedge_codebook_8_hgtw[8] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
};

static wedge_code_type wedge_codebook_8_hltw[8] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

static wedge_code_type wedge_codebook_8_heqw[8] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
};
*/

#if !USE_LARGE_WEDGE_CODEBOOK
static const wedge_code_type wedge_codebook_16_hgtw[16] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

static const wedge_code_type wedge_codebook_16_hltw[16] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 4, 4 },
  { WEDGE_VERTICAL, 6, 4 },   { WEDGE_HORIZONTAL, 4, 4 },
  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

static const wedge_code_type wedge_codebook_16_heqw[16] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
#if CONFIG_CB4X4
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
#endif  // CONFIG_CB4X4
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
#if CONFIG_WEDGE
  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
    wedge_masks[BLOCK_8X8] },
  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
    wedge_masks[BLOCK_8X16] },
  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
    wedge_masks[BLOCK_16X8] },
  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
    wedge_masks[BLOCK_16X16] },
  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
    wedge_masks[BLOCK_16X32] },
  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
    wedge_masks[BLOCK_32X16] },
  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
    wedge_masks[BLOCK_32X32] },
  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_32X64], 0,
    wedge_masks[BLOCK_32X64] },
  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_64X32], 0,
    wedge_masks[BLOCK_64X32] },
  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_64X64], 0,
    wedge_masks[BLOCK_64X64] },
#else
  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
    wedge_masks[BLOCK_8X8] },
  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
    wedge_masks[BLOCK_8X16] },
  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
    wedge_masks[BLOCK_16X8] },
  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
    wedge_masks[BLOCK_16X16] },
  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
    wedge_masks[BLOCK_16X32] },
  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
    wedge_masks[BLOCK_32X16] },
  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
    wedge_masks[BLOCK_32X32] },
  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_32X64], 0,
    wedge_masks[BLOCK_32X64] },
  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_64X32], 0,
    wedge_masks[BLOCK_64X32] },
  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_64X64], 0,
    wedge_masks[BLOCK_64X64] },
#endif  // CONFIG_WEDGE
#if CONFIG_EXT_PARTITION
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
#endif  // CONFIG_EXT_PARTITION
};

#else

static const wedge_code_type wedge_codebook_32_hgtw[32] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
  { WEDGE_OBLIQUE27, 4, 1 },  { WEDGE_OBLIQUE27, 4, 2 },
  { WEDGE_OBLIQUE27, 4, 3 },  { WEDGE_OBLIQUE27, 4, 5 },
  { WEDGE_OBLIQUE27, 4, 6 },  { WEDGE_OBLIQUE27, 4, 7 },
  { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
  { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
  { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
  { WEDGE_OBLIQUE63, 1, 4 },  { WEDGE_OBLIQUE63, 2, 4 },
  { WEDGE_OBLIQUE63, 3, 4 },  { WEDGE_OBLIQUE63, 5, 4 },
  { WEDGE_OBLIQUE63, 6, 4 },  { WEDGE_OBLIQUE63, 7, 4 },
  { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
  { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
  { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
};

static const wedge_code_type wedge_codebook_32_hltw[32] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 4, 4 },
  { WEDGE_VERTICAL, 6, 4 },   { WEDGE_HORIZONTAL, 4, 4 },
  { WEDGE_OBLIQUE27, 4, 1 },  { WEDGE_OBLIQUE27, 4, 2 },
  { WEDGE_OBLIQUE27, 4, 3 },  { WEDGE_OBLIQUE27, 4, 5 },
  { WEDGE_OBLIQUE27, 4, 6 },  { WEDGE_OBLIQUE27, 4, 7 },
  { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
  { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
  { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
  { WEDGE_OBLIQUE63, 1, 4 },  { WEDGE_OBLIQUE63, 2, 4 },
  { WEDGE_OBLIQUE63, 3, 4 },  { WEDGE_OBLIQUE63, 5, 4 },
  { WEDGE_OBLIQUE63, 6, 4 },  { WEDGE_OBLIQUE63, 7, 4 },
  { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
  { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
  { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
};

static const wedge_code_type wedge_codebook_32_heqw[32] = {
  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
  { WEDGE_OBLIQUE27, 4, 1 },  { WEDGE_OBLIQUE27, 4, 2 },
  { WEDGE_OBLIQUE27, 4, 3 },  { WEDGE_OBLIQUE27, 4, 5 },
  { WEDGE_OBLIQUE27, 4, 6 },  { WEDGE_OBLIQUE27, 4, 7 },
  { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
  { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
  { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
  { WEDGE_OBLIQUE63, 1, 4 },  { WEDGE_OBLIQUE63, 2, 4 },
  { WEDGE_OBLIQUE63, 3, 4 },  { WEDGE_OBLIQUE63, 5, 4 },
  { WEDGE_OBLIQUE63, 6, 4 },  { WEDGE_OBLIQUE63, 7, 4 },
  { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
  { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
  { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
};

const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
#if CONFIG_CB4X4
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
#endif
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
#if CONFIG_WEDGE
  { 5, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
    wedge_masks[BLOCK_8X8] },
  { 5, wedge_codebook_32_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
    wedge_masks[BLOCK_8X16] },
  { 5, wedge_codebook_32_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
    wedge_masks[BLOCK_16X8] },
  { 5, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
    wedge_masks[BLOCK_16X16] },
  { 5, wedge_codebook_32_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
    wedge_masks[BLOCK_16X32] },
  { 5, wedge_codebook_32_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
    wedge_masks[BLOCK_32X16] },
  { 5, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
    wedge_masks[BLOCK_32X32] },
  { 0, wedge_codebook_32_hgtw, wedge_signflip_lookup[BLOCK_32X64], 0,
    wedge_masks[BLOCK_32X64] },
  { 0, wedge_codebook_32_hltw, wedge_signflip_lookup[BLOCK_64X32], 0,
    wedge_masks[BLOCK_64X32] },
  { 0, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_64X64], 0,
    wedge_masks[BLOCK_64X64] },
#else
  { 0, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
    wedge_masks[BLOCK_8X8] },
  { 0, wedge_codebook_32_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
    wedge_masks[BLOCK_8X16] },
  { 0, wedge_codebook_32_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
    wedge_masks[BLOCK_16X8] },
  { 0, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
    wedge_masks[BLOCK_16X16] },
  { 0, wedge_codebook_32_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
    wedge_masks[BLOCK_16X32] },
  { 0, wedge_codebook_32_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
    wedge_masks[BLOCK_32X16] },
  { 0, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
    wedge_masks[BLOCK_32X32] },
  { 0, wedge_codebook_32_hgtw, wedge_signflip_lookup[BLOCK_32X64], 0,
    wedge_masks[BLOCK_32X64] },
  { 0, wedge_codebook_32_hltw, wedge_signflip_lookup[BLOCK_64X32], 0,
    wedge_masks[BLOCK_64X32] },
  { 0, wedge_codebook_32_heqw, wedge_signflip_lookup[BLOCK_64X64], 0,
    wedge_masks[BLOCK_64X64] },
#endif  // CONFIG_WEDGE
#if CONFIG_EXT_PARTITION
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
  { 0, NULL, NULL, 0, NULL },
#endif  // CONFIG_EXT_PARTITION
};
#endif  // USE_LARGE_WEDGE_CODEBOOK

static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
                                             BLOCK_SIZE sb_type) {
  const uint8_t *master;
  const int bh = block_size_high[sb_type];
  const int bw = block_size_wide[sb_type];
  const wedge_code_type *a =
      wedge_params_lookup[sb_type].codebook + wedge_index;
  const int smoother = wedge_params_lookup[sb_type].smoother;
  int woff, hoff;
  const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];

  assert(wedge_index >= 0 &&
         wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
  woff = (a->x_offset * bw) >> 3;
  hoff = (a->y_offset * bh) >> 3;
  master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] +
           MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
           MASK_MASTER_SIZE / 2 - woff;
  return master;
}

const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
                                 BLOCK_SIZE sb_type, int offset_x,
                                 int offset_y) {
  const uint8_t *mask =
      get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type);
  if (mask) mask -= (offset_x + offset_y * MASK_MASTER_STRIDE);
  return mask;
}

#if CONFIG_COMPOUND_SEGMENT
static uint8_t *invert_mask(uint8_t *mask_inv_buffer, const uint8_t *const mask,
                            int h, int w, int stride) {
  int i, j;

  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      mask_inv_buffer[i * stride + j] =
          AOM_BLEND_A64_MAX_ALPHA - mask[i * stride + j];
    }
  return mask_inv_buffer;
}
#endif  // CONFIG_COMPOUND_SEGMENT

const uint8_t *av1_get_compound_type_mask_inverse(
    const INTERINTER_COMPOUND_DATA *const comp_data,
#if CONFIG_COMPOUND_SEGMENT
    uint8_t *mask_buffer, int h, int w, int stride,
#endif
    BLOCK_SIZE sb_type) {
  assert(is_masked_compound_type(comp_data->type));
  switch (comp_data->type) {
    case COMPOUND_WEDGE:
      return av1_get_contiguous_soft_mask(comp_data->wedge_index,
                                          !comp_data->wedge_sign, sb_type);
#if CONFIG_COMPOUND_SEGMENT
    case COMPOUND_SEG:
      return invert_mask(mask_buffer, comp_data->seg_mask, h, w, stride);
#endif  // CONFIG_COMPOUND_SEGMENT
    default: assert(0); return NULL;
  }
}

const uint8_t *av1_get_compound_type_mask(
    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
  assert(is_masked_compound_type(comp_data->type));
  switch (comp_data->type) {
    case COMPOUND_WEDGE:
      return av1_get_contiguous_soft_mask(comp_data->wedge_index,
                                          comp_data->wedge_sign, sb_type);
#if CONFIG_COMPOUND_SEGMENT
    case COMPOUND_SEG: return comp_data->seg_mask;
#endif  // CONFIG_COMPOUND_SEGMENT
    default: assert(0); return NULL;
  }
}

#if CONFIG_COMPOUND_SEGMENT
#if COMPOUND_SEGMENT_TYPE == 0
static void uniform_mask(uint8_t *mask, int which_inverse, BLOCK_SIZE sb_type,
                         int h, int w, int mask_val) {
  int i, j;
  int block_stride = block_size_wide[sb_type];
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      mask[i * block_stride + j] =
          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - mask_val : mask_val;
    }
}

void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
                             const uint8_t *src0, int src0_stride,
                             const uint8_t *src1, int src1_stride,
                             BLOCK_SIZE sb_type, int h, int w) {
  (void)src0;
  (void)src1;
  (void)src0_stride;
  (void)src1_stride;
  switch (mask_type) {
    case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break;
    case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break;
    default: assert(0);
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
                                    const uint8_t *src0, int src0_stride,
                                    const uint8_t *src1, int src1_stride,
                                    BLOCK_SIZE sb_type, int h, int w, int bd) {
  (void)src0;
  (void)src1;
  (void)src0_stride;
  (void)src1_stride;
  (void)bd;
  switch (mask_type) {
    case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break;
    case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break;
    default: assert(0);
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH

#elif COMPOUND_SEGMENT_TYPE == 1
#define DIFF_FACTOR 16
static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
                         const uint8_t *src0, int src0_stride,
                         const uint8_t *src1, int src1_stride,
                         BLOCK_SIZE sb_type, int h, int w) {
  int i, j, m, diff;
  int block_stride = block_size_wide[sb_type];
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      diff =
          abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
      m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
      mask[i * block_stride + j] =
          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
    }
  }
}

void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
                             const uint8_t *src0, int src0_stride,
                             const uint8_t *src1, int src1_stride,
                             BLOCK_SIZE sb_type, int h, int w) {
  switch (mask_type) {
    case DIFFWTD_42:
      diffwtd_mask(mask, 0, 42, src0, src0_stride, src1, src1_stride, sb_type,
                   h, w);
      break;
    case DIFFWTD_42_INV:
      diffwtd_mask(mask, 1, 42, src0, src0_stride, src1, src1_stride, sb_type,
                   h, w);
      break;
    default: assert(0);
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
static void diffwtd_mask_highbd(uint8_t *mask, int which_inverse, int mask_base,
                                const uint16_t *src0, int src0_stride,
                                const uint16_t *src1, int src1_stride,
                                BLOCK_SIZE sb_type, int h, int w, int bd) {
  int i, j, m, diff;
  int block_stride = block_size_wide[sb_type];
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      diff = abs((int)src0[i * src0_stride + j] -
                 (int)src1[i * src1_stride + j]) >>
             (bd - 8);
      m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
      mask[i * block_stride + j] =
          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
    }
  }
}

void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
                                    const uint8_t *src0, int src0_stride,
                                    const uint8_t *src1, int src1_stride,
                                    BLOCK_SIZE sb_type, int h, int w, int bd) {
  switch (mask_type) {
    case DIFFWTD_42:
      diffwtd_mask_highbd(mask, 0, 42, CONVERT_TO_SHORTPTR(src0), src0_stride,
                          CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w,
                          bd);
      break;
    case DIFFWTD_42_INV:
      diffwtd_mask_highbd(mask, 1, 42, CONVERT_TO_SHORTPTR(src0), src0_stride,
                          CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w,
                          bd);
      break;
    default: assert(0);
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
#endif  // COMPOUND_SEGMENT_TYPE
#endif  // CONFIG_COMPOUND_SEGMENT

static void init_wedge_master_masks() {
  int i, j, s;
  const int w = MASK_MASTER_SIZE;
  const int h = MASK_MASTER_SIZE;
  const int stride = MASK_MASTER_STRIDE;
  const int a[2] = { 2, 1 };
  const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
  for (s = 0; s < NSMOOTHERS; s++) {
    for (i = 0; i < h; ++i)
      for (j = 0; j < w; ++j) {
        int x = (2 * j + 1 - w);
        int y = (2 * i + 1 - h);
        int m = (int)rint((a[0] * x + a[1] * y) / asqrt);
        wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] =
            wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] =
                get_masked_weight(m, s);
        wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
            wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
                (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
        wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] =
            wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] =
                (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
        wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
            wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
                get_masked_weight(m, s);
        wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] =
            wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] =
                get_masked_weight(x, s);
        wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] =
            wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] =
                (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(x, s);
      }
  }
}

// If the signs for the wedges for various blocksizes are
// inconsistent flip the sign flag. Do it only once for every
// wedge codebook.
static void init_wedge_signs() {
  BLOCK_SIZE sb_type;
  memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
  for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES; ++sb_type) {
    const int bw = block_size_wide[sb_type];
    const int bh = block_size_high[sb_type];
    const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
    const int wbits = wedge_params.bits;
    const int wtypes = 1 << wbits;
    int i, w;
    if (wbits == 0) continue;
    for (w = 0; w < wtypes; ++w) {
      const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
      int sum = 0;
      for (i = 0; i < bw; ++i) sum += mask[i];
      for (i = 0; i < bh; ++i) sum += mask[i * MASK_MASTER_STRIDE];
      sum = (sum + (bw + bh) / 2) / (bw + bh);
      wedge_params.signflip[w] = (sum < 32);
    }
  }
}

static void init_wedge_masks() {
  uint8_t *dst = wedge_mask_buf;
  BLOCK_SIZE bsize;
  memset(wedge_masks, 0, sizeof(wedge_masks));
  for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES; ++bsize) {
    const uint8_t *mask;
    const int bw = block_size_wide[bsize];
    const int bh = block_size_high[bsize];
    const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
    const int wbits = wedge_params->bits;
    const int wtypes = 1 << wbits;
    int w;
    if (wbits == 0) continue;
    for (w = 0; w < wtypes; ++w) {
      mask = get_wedge_mask_inplace(w, 0, bsize);
      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
                        bh);
      wedge_params->masks[0][w] = dst;
      dst += bw * bh;

      mask = get_wedge_mask_inplace(w, 1, bsize);
      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
                        bh);
      wedge_params->masks[1][w] = dst;
      dst += bw * bh;
    }
    assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
  }
}

// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
void av1_init_wedge_masks() {
  init_wedge_master_masks();
  init_wedge_signs();
  init_wedge_masks();
}

#if CONFIG_SUPERTX
static void build_masked_compound_wedge_extend(
    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
    const uint8_t *src1, int src1_stride,
    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type,
    int wedge_offset_x, int wedge_offset_y, int h, int w) {
  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask;
  size_t mask_stride;
  switch (comp_data->type) {
    case COMPOUND_WEDGE:
      mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign,
                               sb_type, wedge_offset_x, wedge_offset_y);
      mask_stride = MASK_MASTER_STRIDE;
      break;
#if CONFIG_COMPOUND_SEGMENT
    case COMPOUND_SEG:
      mask = comp_data->seg_mask;
      mask_stride = block_size_wide[sb_type];
      break;
#endif
    default: assert(0); return;
  }
  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                     mask, mask_stride, h, w, subh, subw);
}

#if CONFIG_AOM_HIGHBITDEPTH
static void build_masked_compound_wedge_extend_highbd(
    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
    const uint8_t *src1_8, int src1_stride,
    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type,
    int wedge_offset_x, int wedge_offset_y, int h, int w, int bd) {
  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask;
  size_t mask_stride;
  switch (comp_data->type) {
    case COMPOUND_WEDGE:
      mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign,
                               sb_type, wedge_offset_x, wedge_offset_y);
      mask_stride = MASK_MASTER_STRIDE;
      break;
#if CONFIG_COMPOUND_SEGMENT
    case COMPOUND_SEG:
      mask = comp_data->seg_mask;
      mask_stride = block_size_wide[sb_type];
      break;
#endif
    default: assert(0); return;
  }
  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                            src1_stride, mask, mask_stride, h, w, subh, subw,
                            bd);
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
#else
static void build_masked_compound(
    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
    const uint8_t *src1, int src1_stride,
    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
    int w) {
  // Derive subsampling from h and w passed in. May be refactored to
  // pass in subsampling factors directly.
  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                     mask, block_size_wide[sb_type], h, w, subh, subw);
}

#if CONFIG_AOM_HIGHBITDEPTH
static void build_masked_compound_highbd(
    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
    const uint8_t *src1_8, int src1_stride,
    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
    int w, int bd) {
  // Derive subsampling from h and w passed in. May be refactored to
  // pass in subsampling factors directly.
  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
  // const uint8_t *mask =
  //     av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                            src1_stride, mask, block_size_wide[sb_type], h, w,
                            subh, subw, bd);
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
#endif  // CONFIG_SUPERTX

void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
                                     uint8_t *dst, int dst_stride,
                                     const int subpel_x, const int subpel_y,
                                     const struct scale_factors *sf, int w,
                                     int h,
#if CONFIG_DUAL_FILTER
                                     const InterpFilter *interp_filter,
#else
                                     const InterpFilter interp_filter,
#endif
                                     int xs, int ys,
#if CONFIG_SUPERTX
                                     int wedge_offset_x, int wedge_offset_y,
#endif  // CONFIG_SUPERTX
                                     int plane,
#if CONFIG_GLOBAL_MOTION
                                     int is_global, int p_col, int p_row,
                                     int ref,
#endif  // CONFIG_GLOBAL_MOTION
                                     MACROBLOCKD *xd) {
  MODE_INFO *mi = xd->mi[0];
  INTERINTER_COMPOUND_DATA *comp_data = &mi->mbmi.interinter_compound_data;
// The prediction filter types used here should be those for
// the second reference block.
#if CONFIG_DUAL_FILTER
  InterpFilter tmp_ipf[4] = {
    interp_filter[2], interp_filter[3], interp_filter[2], interp_filter[3],
  };
#else
  InterpFilter tmp_ipf = interp_filter;
#endif  // CONFIG_DUAL_FILTER
  ConvolveParams conv_params = get_conv_params(0, plane);

#if CONFIG_AOM_HIGHBITDEPTH
  DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
  uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
                         ? CONVERT_TO_BYTEPTR(tmp_dst_)
                         : tmp_dst_;
  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
                           subpel_y, sf, w, h, &conv_params, tmp_ipf,
#if CONFIG_GLOBAL_MOTION
                           is_global, p_col, p_row, plane, ref,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif
#endif  // CONFIG_GLOBAL_MOTION
                           xs, ys, xd);
#if CONFIG_COMPOUND_SEGMENT
  if (!plane && comp_data->type == COMPOUND_SEG) {
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
      build_compound_seg_mask_highbd(comp_data->seg_mask, comp_data->mask_type,
                                     dst, dst_stride, tmp_dst, MAX_SB_SIZE,
                                     mi->mbmi.sb_type, h, w, xd->bd);
    else
      build_compound_seg_mask(comp_data->seg_mask, comp_data->mask_type, dst,
                              dst_stride, tmp_dst, MAX_SB_SIZE,
                              mi->mbmi.sb_type, h, w);
  }
#endif  // CONFIG_COMPOUND_SEGMENT

#if CONFIG_SUPERTX
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
    build_masked_compound_wedge_extend_highbd(
        dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, comp_data,
        mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
  else
    build_masked_compound_wedge_extend(
        dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, comp_data,
        mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w);
#else
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
    build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst,
                                 MAX_SB_SIZE, comp_data, mi->mbmi.sb_type, h, w,
                                 xd->bd);
  else
    build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
                          MAX_SB_SIZE, comp_data, mi->mbmi.sb_type, h, w);
#endif  // CONFIG_SUPERTX

#else  // CONFIG_AOM_HIGHBITDEPTH
  DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
                           subpel_y, sf, w, h, &conv_params, tmp_ipf,
#if CONFIG_GLOBAL_MOTION
                           is_global, p_col, p_row, plane, ref,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif
#endif  // CONFIG_GLOBAL_MOTION
                           xs, ys, xd);
#if CONFIG_COMPOUND_SEGMENT
  if (!plane && comp_data->type == COMPOUND_SEG)
    build_compound_seg_mask(comp_data->seg_mask, comp_data->mask_type, dst,
                            dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.sb_type,
                            h, w);
#endif  // CONFIG_COMPOUND_SEGMENT
#if CONFIG_SUPERTX
  build_masked_compound_wedge_extend(dst, dst_stride, dst, dst_stride, tmp_dst,
                                     MAX_SB_SIZE, comp_data, mi->mbmi.sb_type,
                                     wedge_offset_x, wedge_offset_y, h, w);
#else
  build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
                        comp_data, mi->mbmi.sb_type, h, w);
#endif  // CONFIG_SUPERTX
#endif  // CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_COMPOUND_SEGMENT
  (void)plane;
#endif  // CONFIG_COMPOUND_SEGMENT
}
#endif  // CONFIG_EXT_INTER

// TODO(sarahparker) av1_highbd_build_inter_predictor and
// av1_build_inter_predictor should be combined with
// av1_make_inter_predictor
#if CONFIG_AOM_HIGHBITDEPTH
void av1_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride,
                                      const MV *src_mv,
                                      const struct scale_factors *sf, int w,
                                      int h, int ref,
#if CONFIG_DUAL_FILTER
                                      const InterpFilter *interp_filter,
#else
                                      const InterpFilter interp_filter,
#endif
#if CONFIG_GLOBAL_MOTION
                                      int is_global, int p_col, int p_row,
#endif  // CONFIG_GLOBAL_MOTION
                                      int plane, enum mv_precision precision,
                                      int x, int y, const MACROBLOCKD *xd) {
  const int is_q4 = precision == MV_PRECISION_Q4;
  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
                     is_q4 ? src_mv->col : src_mv->col * 2 };
  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
  const int subpel_x = mv.col & SUBPEL_MASK;
  const int subpel_y = mv.row & SUBPEL_MASK;
  ConvolveParams conv_params = get_conv_params(ref, plane);

  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);

  av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
                           sf, w, h, &conv_params, interp_filter,
#if CONFIG_GLOBAL_MOTION
                           is_global, p_col, p_row, plane, ref,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif
#endif  // CONFIG_GLOBAL_MOTION
                           sf->x_step_q4, sf->y_step_q4, xd);
}
#endif  // CONFIG_AOM_HIGHBITDEPTH

void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, const MV *src_mv,
                               const struct scale_factors *sf, int w, int h,
                               ConvolveParams *conv_params,
#if CONFIG_DUAL_FILTER
                               const InterpFilter *interp_filter,
#else
                               const InterpFilter interp_filter,
#endif
#if CONFIG_GLOBAL_MOTION
                               int is_global, int p_col, int p_row, int plane,
                               int ref,
#endif  // CONFIG_GLOBAL_MOTION
                               enum mv_precision precision, int x, int y,
                               const MACROBLOCKD *xd) {
  const int is_q4 = precision == MV_PRECISION_Q4;
  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
                     is_q4 ? src_mv->col : src_mv->col * 2 };
  MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
  const int subpel_x = mv.col & SUBPEL_MASK;
  const int subpel_y = mv.row & SUBPEL_MASK;

  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);

  av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
                           sf, w, h, conv_params, interp_filter,
#if CONFIG_GLOBAL_MOTION
                           is_global, p_col, p_row, plane, ref,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif
#endif  // CONFIG_GLOBAL_MOTION
                           sf->x_step_q4, sf->y_step_q4, xd);
}

typedef struct SubpelParams {
  int xs;
  int ys;
  int subpel_x;
  int subpel_y;
} SubpelParams;

void build_inter_predictors(MACROBLOCKD *xd, int plane,
#if CONFIG_MOTION_VAR
                            int mi_col_offset, int mi_row_offset,
#endif  // CONFIG_MOTION_VAR
                            int block, int bw, int bh, int x, int y, int w,
                            int h,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                            int wedge_offset_x, int wedge_offset_y,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                            int mi_x, int mi_y) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
#if CONFIG_MOTION_VAR
  const MODE_INFO *mi = xd->mi[mi_col_offset + xd->mi_stride * mi_row_offset];
#if !CONFIG_CB4X4 || CONFIG_SUB8X8_MC
  const int build_for_obmc = !(mi_col_offset == 0 && mi_row_offset == 0);
#endif  // !CONFIG_CB4X4 || CONFIG_SUB8X8_MC
#else
  const MODE_INFO *mi = xd->mi[0];
#endif  // CONFIG_MOTION_VAR
  const int is_compound = has_second_ref(&mi->mbmi);
  int ref;
#if CONFIG_GLOBAL_MOTION
  int is_global[2];
  for (ref = 0; ref < 1 + is_compound; ++ref) {
    WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
    is_global[ref] = is_global_mv_block(mi, block, wm->wmtype);
  }
#endif  // CONFIG_GLOBAL_MOTION

#if CONFIG_CB4X4
  (void)block;
#endif

#if CONFIG_SUB8X8_MC
#if CONFIG_MOTION_VAR
  if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0 && !build_for_obmc) {
#else
  if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
#endif  // CONFIG_MOTION_VAR
    // block size in log2
    const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
    const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
    const int b8_sl = b_width_log2_lookup[BLOCK_8X8];

    // block size
    const int b4_w = 1 << b4_wl;
    const int b4_h = 1 << b4_hl;
    const int b8_s = 1 << b8_sl;
    int idx, idy;

    const int x_base = x;
    const int y_base = y;

    // processing unit size
    const int x_step = w >> (b8_sl - b4_wl);
    const int y_step = h >> (b8_sl - b4_hl);

    for (idy = 0; idy < b8_s; idy += b4_h) {
      for (idx = 0; idx < b8_s; idx += b4_w) {
        const int chr_idx = (idy * 2) + idx;
        for (ref = 0; ref < 1 + is_compound; ++ref) {
          const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
          struct buf_2d *const pre_buf = &pd->pre[ref];
          struct buf_2d *const dst_buf = &pd->dst;
          uint8_t *dst = dst_buf->buf;
          const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
          const MV mv_q4 = clamp_mv_to_umv_border_sb(
              xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
          uint8_t *pre;
          MV32 scaled_mv;
          int xs, ys, subpel_x, subpel_y;
          const int is_scaled = av1_is_scaled(sf);
          ConvolveParams conv_params = get_conv_params(ref, plane);

          x = x_base + idx * x_step;
          y = y_base + idy * y_step;

          dst += dst_buf->stride * y + x;

          if (is_scaled) {
            pre =
                pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
            scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
            xs = sf->x_step_q4;
            ys = sf->y_step_q4;
          } else {
            pre = pre_buf->buf + y * pre_buf->stride + x;
            scaled_mv.row = mv_q4.row;
            scaled_mv.col = mv_q4.col;
            xs = ys = 16;
          }

          subpel_x = scaled_mv.col & SUBPEL_MASK;
          subpel_y = scaled_mv.row & SUBPEL_MASK;
          pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
                 (scaled_mv.col >> SUBPEL_BITS);

#if CONFIG_EXT_INTER
          if (ref &&
              is_masked_compound_type(mi->mbmi.interinter_compound_data.type))
            av1_make_masked_inter_predictor(
                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
                sf, w, h, mi->mbmi.interp_filter, xs, ys,
#if CONFIG_SUPERTX
                wedge_offset_x, wedge_offset_y,
#endif  // CONFIG_SUPERTX
                plane,
#if CONFIG_GLOBAL_MOTION
                is_global[ref], (mi_x >> pd->subsampling_x) + x,
                (mi_y >> pd->subsampling_y) + y, ref,
#endif  // CONFIG_GLOBAL_MOTION
                xd);
          else
#endif  // CONFIG_EXT_INTER
            av1_make_inter_predictor(
                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
                sf, x_step, y_step, &conv_params, mi->mbmi.interp_filter,
#if CONFIG_GLOBAL_MOTION
                is_global[ref], (mi_x >> pd->subsampling_x) + x,
                (mi_y >> pd->subsampling_y) + y, plane, ref,
#if CONFIG_MOTION_VAR
                mi_col_offset, mi_row_offset,
#endif
#endif  // CONFIG_GLOBAL_MOTION
                xs, ys, xd);
        }
      }
    }
    return;
  }
#endif

  {
    struct buf_2d *const dst_buf = &pd->dst;
    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
    uint8_t *pre[2];
    MV32 scaled_mv[2];
    SubpelParams subpel_params[2];
#if CONFIG_CONVOLVE_ROUND
    DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
    av1_zero(tmp_dst);
#endif  // CONFIG_CONVOLVE_ROUND

    for (ref = 0; ref < 1 + is_compound; ++ref) {
      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
      struct buf_2d *const pre_buf = &pd->pre[ref];
#if CONFIG_CB4X4
      const MV mv = mi->mbmi.mv[ref].as_mv;
#else
      const MV mv =
#if CONFIG_MOTION_VAR
          (mi->mbmi.sb_type < BLOCK_8X8 && !build_for_obmc)
              ?
#else
          mi->mbmi.sb_type < BLOCK_8X8 ?
#endif
              average_split_mvs(pd, mi, ref, block)
              : mi->mbmi.mv[ref].as_mv;
#endif

      // TODO(jkoleszar): This clamping is done in the incorrect place for the
      // scaling case. It needs to be done on the scaled MV, not the pre-scaling
      // MV. Note however that it performs the subsampling aware scaling so
      // that the result is always q4.
      // mv_precision precision is MV_PRECISION_Q4.
      const MV mv_q4 = clamp_mv_to_umv_border_sb(
          xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);

      const int is_scaled = av1_is_scaled(sf);

      if (is_scaled) {
        pre[ref] =
            pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
        scaled_mv[ref] = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
        subpel_params[ref].xs = sf->x_step_q4;
        subpel_params[ref].ys = sf->y_step_q4;
      } else {
        pre[ref] = pre_buf->buf + (y * pre_buf->stride + x);
        scaled_mv[ref].row = mv_q4.row;
        scaled_mv[ref].col = mv_q4.col;
        subpel_params[ref].xs = 16;
        subpel_params[ref].ys = 16;
      }

      subpel_params[ref].subpel_x = scaled_mv[ref].col & SUBPEL_MASK;
      subpel_params[ref].subpel_y = scaled_mv[ref].row & SUBPEL_MASK;
      pre[ref] += (scaled_mv[ref].row >> SUBPEL_BITS) * pre_buf->stride +
                  (scaled_mv[ref].col >> SUBPEL_BITS);
    }

#if CONFIG_CONVOLVE_ROUND
    ConvolveParams conv_params =
        get_conv_params_no_round(ref, plane, tmp_dst, MAX_SB_SIZE);
#else
    ConvolveParams conv_params = get_conv_params(ref, plane);
#endif  // CONFIG_CONVOLVE_ROUND
    for (ref = 0; ref < 1 + is_compound; ++ref) {
      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
      struct buf_2d *const pre_buf = &pd->pre[ref];
      conv_params.ref = ref;
#if CONFIG_EXT_INTER
      if (ref &&
          is_masked_compound_type(mi->mbmi.interinter_compound_data.type))
        av1_make_masked_inter_predictor(
            pre[ref], pre_buf->stride, dst, dst_buf->stride,
            subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
            mi->mbmi.interp_filter, subpel_params[ref].xs,
            subpel_params[ref].ys,
#if CONFIG_SUPERTX
            wedge_offset_x, wedge_offset_y,
#endif  // CONFIG_SUPERTX
            plane,
#if CONFIG_GLOBAL_MOTION
            is_global[ref], (mi_x >> pd->subsampling_x) + x,
            (mi_y >> pd->subsampling_y) + y, ref,
#endif  // CONFIG_GLOBAL_MOTION
            xd);
      else
#endif  // CONFIG_EXT_INTER
        av1_make_inter_predictor(
            pre[ref], pre_buf->stride, dst, dst_buf->stride,
            subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
            &conv_params, mi->mbmi.interp_filter,
#if CONFIG_GLOBAL_MOTION
            is_global[ref], (mi_x >> pd->subsampling_x) + x,
            (mi_y >> pd->subsampling_y) + y, plane, ref,
#if CONFIG_MOTION_VAR
            mi_col_offset, mi_row_offset,
#endif
#endif  // CONFIG_GLOBAL_MOTION
            subpel_params[ref].xs, subpel_params[ref].ys, xd);
    }

#if CONFIG_CONVOLVE_ROUND
// TODO(angiebird): This part needs optimization
#if CONFIG_AOM_HIGHBITDEPTH
    if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
#endif  // CONFIG_AOM_HIGHBITDEPTH
      av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
                            FILTER_BITS * 2 + is_compound -
                                conv_params.round_0 - conv_params.round_1);
#endif  // CONFIG_CONVOLVE_ROUND
  }
}

void av1_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane, int i, int ir,
                                      int ic, int mi_row, int mi_col) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  MODE_INFO *const mi = xd->mi[0];
  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
  const int width = block_size_wide[plane_bsize];
  const int height = block_size_high[plane_bsize];
  uint8_t *const dst = &pd->dst.buf[(ir * pd->dst.stride + ic) << 2];
  int ref;
  const int is_compound = has_second_ref(&mi->mbmi);
#if CONFIG_GLOBAL_MOTION
  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
  int is_global[2];
  for (ref = 0; ref < 1 + is_compound; ++ref) {
    WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
    is_global[ref] = is_global_mv_block(mi, i, wm->wmtype);
  }
#endif  // CONFIG_GLOBAL_MOTION

  for (ref = 0; ref < 1 + is_compound; ++ref) {
    ConvolveParams conv_params = get_conv_params(ref, plane);
    const uint8_t *pre =
        &pd->pre[ref].buf[(ir * pd->pre[ref].stride + ic) << 2];
#if CONFIG_AOM_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
      av1_highbd_build_inter_predictor(
          pre, pd->pre[ref].stride, dst, pd->dst.stride,
          &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height,
          ref, mi->mbmi.interp_filter,
#if CONFIG_GLOBAL_MOTION
          is_global[ref], p_col, p_row,
#endif  // CONFIG_GLOBAL_MOTION
          plane, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
          mi_row * MI_SIZE + 4 * ir, xd);
    else
#endif  // CONFIG_AOM_HIGHBITDEPTH
      av1_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride,
                                &mi->bmi[i].as_mv[ref].as_mv,
                                &xd->block_refs[ref]->sf, width, height,
                                &conv_params, mi->mbmi.interp_filter,
#if CONFIG_GLOBAL_MOTION
                                is_global[ref], p_col, p_row, plane, ref,
#endif  // CONFIG_GLOBAL_MOTION
                                MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
                                mi_row * MI_SIZE + 4 * ir, xd);
  }
}

static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                              int mi_row, int mi_col,
                                              int plane_from, int plane_to) {
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
#if CONFIG_CB4X4
  const int unify_bsize = 1;
#else
  const int unify_bsize = 0;
#endif
  for (plane = plane_from; plane <= plane_to; ++plane) {
    const struct macroblockd_plane *pd = &xd->plane[plane];
    const int bw = pd->width;
    const int bh = pd->height;

#if CONFIG_CB4X4
    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
                             pd->subsampling_y))
      continue;
#endif

    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !unify_bsize) {
      const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type;
      const int have_vsplit = bp != PARTITION_HORZ;
      const int have_hsplit = bp != PARTITION_VERT;
      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
      const int pw = 8 >> (have_vsplit | pd->subsampling_x);
      const int ph = 8 >> (have_hsplit | pd->subsampling_y);
      int x, y;
      assert(bp != PARTITION_NONE && bp < PARTITION_TYPES);
      assert(bsize == BLOCK_8X8);
      assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);
      for (y = 0; y < num_4x4_h; ++y)
        for (x = 0; x < num_4x4_w; ++x)
          build_inter_predictors(xd, plane,
#if CONFIG_MOTION_VAR
                                 0, 0,
#endif  // CONFIG_MOTION_VAR
                                 y * 2 + x, bw, bh, 4 * x, 4 * y, pw, ph,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                                 0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                                 mi_x, mi_y);
    } else {
      build_inter_predictors(xd, plane,
#if CONFIG_MOTION_VAR
                             0, 0,
#endif  // CONFIG_MOTION_VAR
                             0, bw, bh, 0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                             0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                             mi_x, mi_y);
    }
  }
}

void av1_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BUFFER_SET *ctx, BLOCK_SIZE bsize) {
  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
#if CONFIG_EXT_INTER
  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
    BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
                               { xd->plane[0].dst.stride, 0, 0 } };
    if (!ctx) ctx = &default_ctx;
    av1_build_interintra_predictors_sby(xd, xd->plane[0].dst.buf,
                                        xd->plane[0].dst.stride, ctx, bsize);
  }
#else
  (void)ctx;
#endif  // CONFIG_EXT_INTER
}

void av1_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BUFFER_SET *ctx, BLOCK_SIZE bsize) {
  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                    MAX_MB_PLANE - 1);
#if CONFIG_EXT_INTER
  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
    BUFFER_SET default_ctx = {
      { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
      { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
    };
    if (!ctx) ctx = &default_ctx;
    av1_build_interintra_predictors_sbuv(
        xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, xd->plane[1].dst.stride,
        xd->plane[2].dst.stride, ctx, bsize);
  }
#else
  (void)ctx;
#endif  // CONFIG_EXT_INTER
}

// TODO(afergs): Check if ctx can be made constant
void av1_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                   BUFFER_SET *ctx, BLOCK_SIZE bsize) {
  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
                                    MAX_MB_PLANE - 1);
#if CONFIG_EXT_INTER
  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
    BUFFER_SET default_ctx = {
      { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
      { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
        xd->plane[2].dst.stride }
    };
    if (!ctx) ctx = &default_ctx;
    av1_build_interintra_predictors(
        xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
        xd->plane[0].dst.stride, xd->plane[1].dst.stride,
        xd->plane[2].dst.stride, ctx, bsize);
  }
#else
  (void)ctx;
#endif  // CONFIG_EXT_INTER
}

void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
                          const YV12_BUFFER_CONFIG *src, int mi_row,
                          int mi_col) {
  uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
                                           src->v_buffer };
  const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
                                     src->uv_crop_width };
  const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
                                      src->uv_crop_height };
  const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
                                      src->uv_stride };
  int i;

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    struct macroblockd_plane *const pd = &planes[i];
    setup_pred_plane(&pd->dst, buffers[i], widths[i], heights[i], strides[i],
                     mi_row, mi_col, NULL, pd->subsampling_x,
                     pd->subsampling_y);
  }
}

void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                          const struct scale_factors *sf) {
  if (src != NULL) {
    int i;
    uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
                                             src->v_buffer };
    const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
                                       src->uv_crop_width };
    const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
                                        src->uv_crop_height };
    const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
                                        src->uv_stride };
    for (i = 0; i < MAX_MB_PLANE; ++i) {
      struct macroblockd_plane *const pd = &xd->plane[i];
      setup_pred_plane(&pd->pre[idx], buffers[i], widths[i], heights[i],
                       strides[i], mi_row, mi_col, sf, pd->subsampling_x,
                       pd->subsampling_y);
    }
  }
}

#if CONFIG_SUPERTX
#if CONFIG_CB4X4
static const uint8_t mask_4[4] = { 64, 52, 12, 0 };
static const uint8_t mask_4_uv[4] = { 64, 52, 12, 0 };
#endif  // CONFIG_CB4X4
static const uint8_t mask_8[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };

static const uint8_t mask_16[16] = { 63, 62, 60, 58, 55, 50, 43, 36,
                                     28, 21, 14, 9,  6,  4,  2,  1 };

static const uint8_t mask_32[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63,
                                     61, 57, 52, 45, 36, 28, 19, 12, 7,  3,  1,
                                     0,  0,  0,  0,  0,  0,  0,  0,  0,  0 };

static const uint8_t mask_8_uv[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };

static const uint8_t mask_16_uv[16] = { 64, 64, 64, 64, 61, 53, 45, 36,
                                        28, 19, 11, 3,  0,  0,  0,  0 };

static const uint8_t mask_32_uv[32] = { 64, 64, 64, 64, 64, 64, 64, 64,
                                        64, 64, 64, 64, 60, 54, 46, 36,
                                        28, 18, 10, 4,  0,  0,  0,  0,
                                        0,  0,  0,  0,  0,  0,  0,  0 };

static const uint8_t *get_supertx_mask(int length, int plane) {
  switch (length) {
#if CONFIG_CB4X4
    case 4: return plane ? mask_4_uv : mask_4;
#endif  // CONFIG_CB4X4
    case 8: return plane ? mask_8_uv : mask_8;
    case 16: return plane ? mask_16_uv : mask_16;
    case 32: return plane ? mask_32_uv : mask_32;
    default: assert(0);
  }
  return NULL;
}

void av1_build_masked_inter_predictor_complex(
    MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
    int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
    int plane) {
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const int ssx = pd->subsampling_x;
  const int ssy = pd->subsampling_y;
  const int top_w = block_size_wide[top_bsize] >> ssx;
  const int top_h = block_size_high[top_bsize] >> ssy;
  const int w = block_size_wide[bsize] >> ssx;
  const int h = block_size_high[bsize] >> ssy;
  const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
  const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;

  int w_remain, h_remain;

#if CONFIG_AOM_HIGHBITDEPTH
  const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif  // CONFIG_AOM_HIGHBITDEPTH

  assert(bsize <= BLOCK_32X32);
  assert(IMPLIES(plane == 0, ssx == 0));
  assert(IMPLIES(plane == 0, ssy == 0));

  switch (partition) {
    case PARTITION_HORZ: {
      const uint8_t *const mask = get_supertx_mask(h, ssy);

      w_remain = top_w;
      h_remain = top_h - h_offset - h;
      dst += h_offset * dst_stride;
      pre += h_offset * pre_stride;

#if CONFIG_AOM_HIGHBITDEPTH
      if (is_hdb)
        aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre,
                                   pre_stride, mask, h, top_w, xd->bd);
      else
#endif  // CONFIG_AOM_HIGHBITDEPTH
        aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
                            mask, h, top_w);

      dst += h * dst_stride;
      pre += h * pre_stride;
      break;
    }
    case PARTITION_VERT: {
      const uint8_t *const mask = get_supertx_mask(w, ssx);

      w_remain = top_w - w_offset - w;
      h_remain = top_h;
      dst += w_offset;
      pre += w_offset;

#if CONFIG_AOM_HIGHBITDEPTH
      if (is_hdb)
        aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre,
                                   pre_stride, mask, top_h, w, xd->bd);
      else
#endif  // CONFIG_AOM_HIGHBITDEPTH
        aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
                            mask, top_h, w);

      dst += w;
      pre += w;
      break;
    }
    default: {
      assert(0);
      return;
    }
  }

  if (w_remain == 0 || h_remain == 0) {
    return;
  }

#if CONFIG_AOM_HIGHBITDEPTH
  if (is_hdb) {
    dst = (uint8_t *)CONVERT_TO_SHORTPTR(dst);
    pre = (const uint8_t *)CONVERT_TO_SHORTPTR(pre);
    dst_stride *= 2;
    pre_stride *= 2;
    w_remain *= 2;
  }
#endif  // CONFIG_AOM_HIGHBITDEPTH

  do {
    memcpy(dst, pre, w_remain * sizeof(uint8_t));
    dst += dst_stride;
    pre += pre_stride;
  } while (--h_remain);
}

void av1_build_inter_predictors_sb_sub8x8_extend(MACROBLOCKD *xd,
#if CONFIG_EXT_INTER
                                                 int mi_row_ori, int mi_col_ori,
#endif  // CONFIG_EXT_INTER
                                                 int mi_row, int mi_col,
                                                 BLOCK_SIZE bsize, int block) {
  // Prediction function used in supertx:
  // Use the mv at current block (which is less than 8x8)
  // to get prediction of a block located at (mi_row, mi_col) at size of bsize
  // bsize can be larger than 8x8.
  // block (0-3): the sub8x8 location of current block
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
#if CONFIG_EXT_INTER
  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
#endif  // CONFIG_EXT_INTER

  // For sub8x8 uv:
  // Skip uv prediction in supertx except the first block (block = 0)
  int max_plane = block ? 1 : MAX_MB_PLANE;

  for (plane = 0; plane < max_plane; plane++) {
    const BLOCK_SIZE plane_bsize =
        get_plane_block_size(bsize, &xd->plane[plane]);
    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
    const int bw = 4 * num_4x4_w;
    const int bh = 4 * num_4x4_h;

    build_inter_predictors(xd, plane,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif  // CONFIG_MOTION_VAR
                           block, bw, bh, 0, 0, bw, bh,
#if CONFIG_EXT_INTER
                           wedge_offset_x, wedge_offset_y,
#endif  // CONFIG_EXT_INTER
                           mi_x, mi_y);
  }
#if CONFIG_EXT_INTER
  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
    BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
                         xd->plane[2].dst.buf },
                       { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
                         xd->plane[2].dst.stride } };
    av1_build_interintra_predictors(
        xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
        xd->plane[0].dst.stride, xd->plane[1].dst.stride,
        xd->plane[2].dst.stride, &ctx, bsize);
  }
#endif  // CONFIG_EXT_INTER
}

void av1_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
#if CONFIG_EXT_INTER
                                          int mi_row_ori, int mi_col_ori,
#endif  // CONFIG_EXT_INTER
                                          int mi_row, int mi_col,
                                          BLOCK_SIZE bsize) {
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
#if CONFIG_EXT_INTER
  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
#endif  // CONFIG_EXT_INTER
  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
    const BLOCK_SIZE plane_bsize =
        get_plane_block_size(bsize, &xd->plane[plane]);
    const int bw = block_size_wide[plane_bsize];
    const int bh = block_size_high[plane_bsize];

    build_inter_predictors(xd, plane,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif  // CONFIG_MOTION_VAR
                           0, bw, bh, 0, 0, bw, bh,
#if CONFIG_EXT_INTER
                           wedge_offset_x, wedge_offset_y,
#endif  // CONFIG_EXT_INTER
                           mi_x, mi_y);
  }
}
#endif  // CONFIG_SUPERTX

#if CONFIG_MOTION_VAR
// obmc_mask_N[overlap_position]
static const uint8_t obmc_mask_1[1] = { 64 };

static const uint8_t obmc_mask_2[2] = { 45, 64 };

static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 };

static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };

static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54,
                                          56, 58, 60, 61, 64, 64, 64, 64 };

static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
                                          45, 47, 48, 50, 51, 52, 53, 55,
                                          56, 57, 58, 59, 60, 60, 61, 62,
                                          64, 64, 64, 64, 64, 64, 64, 64 };

#if CONFIG_EXT_PARTITION
static const uint8_t obmc_mask_64[64] = {
  33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
  45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
  56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
  62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
#endif  // CONFIG_EXT_PARTITION

const uint8_t *av1_get_obmc_mask(int length) {
  switch (length) {
    case 1: return obmc_mask_1;
    case 2: return obmc_mask_2;
    case 4: return obmc_mask_4;
    case 8: return obmc_mask_8;
    case 16: return obmc_mask_16;
    case 32: return obmc_mask_32;
#if CONFIG_EXT_PARTITION
    case 64: return obmc_mask_64;
#endif  // CONFIG_EXT_PARTITION
    default: assert(0); return NULL;
  }
}

#if CONFIG_NCOBMC
// obmc_mask_flipN[overlap_position]
static const uint8_t obmc_mask_flip1[1] = { 55 };

static const uint8_t obmc_mask_flip2[2] = { 62, 45 };

static const uint8_t obmc_mask_flip4[4] = { 64, 59, 50, 39 };

static const uint8_t obmc_mask_flip8[8] = { 64, 63, 61, 57, 53, 48, 42, 36 };

static const uint8_t obmc_mask_flip16[16] = { 64, 64, 64, 63, 61, 60, 58, 56,
                                              54, 52, 49, 46, 43, 40, 37, 34 };

static const uint8_t obmc_mask_flip32[32] = { 64, 64, 64, 64, 64, 63, 63, 62,
                                              62, 61, 60, 60, 59, 58, 57, 56,
                                              55, 53, 52, 51, 50, 48, 47, 45,
                                              44, 43, 41, 40, 38, 36, 35, 33 };

#if CONFIG_EXT_PARTITION
static const uint8_t obmc_mask_flip64[64] = {
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62,
  62, 62, 62, 61, 60, 60, 60, 60, 60, 59, 58, 58, 57, 57, 56, 56,
  56, 55, 54, 53, 52, 52, 51, 51, 51, 50, 49, 48, 47, 47, 46, 45,
  44, 44, 44, 43, 42, 41, 40, 40, 39, 38, 37, 36, 35, 35, 34, 33,
};
#endif  // CONFIG_EXT_PARTITION

const uint8_t *av1_get_obmc_mask_flipped(int length) {
  switch (length) {
    case 1: return obmc_mask_flip1;
    case 2: return obmc_mask_flip2;
    case 4: return obmc_mask_flip4;
    case 8: return obmc_mask_flip8;
    case 16: return obmc_mask_flip16;
    case 32: return obmc_mask_flip32;
#if CONFIG_EXT_PARTITION
    case 64: return obmc_mask_flip64;
#endif  // CONFIG_EXT_PARTITION
    default: assert(0); return NULL;
  }
}
#endif  // CONFIG_NCOBMC

void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int mi_row, int mi_col) {
  int i, mi_step;

  xd->mi[0]->mbmi.overlappable_neighbors[0] = 0;
  xd->mi[0]->mbmi.overlappable_neighbors[1] = 0;

  if (xd->up_available) {
    const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
    for (i = 0; i < ilimit; i += mi_step) {
      int mi_row_offset = -1;
      int mi_col_offset = i;
      MODE_INFO *above_mi =
          xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
      MB_MODE_INFO *above_mbmi = &above_mi->mbmi;

      mi_step = AOMMIN(xd->n8_w, mi_size_wide[above_mbmi->sb_type]);

      if (is_neighbor_overlappable(above_mbmi))
        xd->mi[0]->mbmi.overlappable_neighbors[0]++;
    }
  }

  if (xd->left_available) {
    const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
    for (i = 0; i < ilimit; i += mi_step) {
      int mi_row_offset = i;
      int mi_col_offset = -1;
      MODE_INFO *left_mi =
          xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
      MB_MODE_INFO *left_mbmi = &left_mi->mbmi;

      mi_step = AOMMIN(xd->n8_h, mi_size_high[left_mbmi->sb_type]);

      if (is_neighbor_overlappable(left_mbmi))
        xd->mi[0]->mbmi.overlappable_neighbors[1]++;
    }
  }
}

// This function combines motion compensated predictions that is generated by
// top/left neighboring blocks' inter predictors with the regular inter
// prediction. We assume the original prediction (bmc) is stored in
// xd->plane[].dst.buf
void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int mi_row, int mi_col,
                                     uint8_t *above[MAX_MB_PLANE],
                                     int above_stride[MAX_MB_PLANE],
                                     uint8_t *left[MAX_MB_PLANE],
                                     int left_stride[MAX_MB_PLANE]) {
  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int plane, i;
#if CONFIG_AOM_HIGHBITDEPTH
  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif  // CONFIG_AOM_HIGHBITDEPTH

  // handle above row
  if (xd->up_available) {
    const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
    const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
    const int mi_row_offset = -1;
    const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
    int neighbor_count = 0;

    assert(miw > 0);

    i = 0;
    do {  // for each mi in the above row
      const int mi_col_offset = i;
      const MB_MODE_INFO *const above_mbmi =
          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
      const BLOCK_SIZE a_bsize = above_mbmi->sb_type;
      const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);

      if (is_neighbor_overlappable(above_mbmi)) {
        neighbor_count++;
        if (neighbor_count > neighbor_limit) break;
        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
          const struct macroblockd_plane *pd = &xd->plane[plane];
          const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
          const int bh = overlap >> pd->subsampling_y;
          const int dst_stride = pd->dst.stride;
          uint8_t *const dst = &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
          const int tmp_stride = above_stride[plane];
          const uint8_t *const tmp =
              &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
          const uint8_t *const mask = av1_get_obmc_mask(bh);

#if CONFIG_AOM_HIGHBITDEPTH
          if (is_hbd)
            aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
                                       tmp_stride, mask, bh, bw, xd->bd);
          else
#endif  // CONFIG_AOM_HIGHBITDEPTH
            aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
                                tmp_stride, mask, bh, bw);
        }
      }
      i += mi_step;
    } while (i < miw);
  }

  // handle left column
  if (xd->left_available) {
    const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
    const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
    const int mi_col_offset = -1;
    const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
    int neighbor_count = 0;

    assert(mih > 0);

    i = 0;
    do {  // for each mi in the left column
      const int mi_row_offset = i;
      const MB_MODE_INFO *const left_mbmi =
          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
      const BLOCK_SIZE l_bsize = left_mbmi->sb_type;
      const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);

      if (is_neighbor_overlappable(left_mbmi)) {
        neighbor_count++;
        if (neighbor_count > neighbor_limit) break;
        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
          const struct macroblockd_plane *pd = &xd->plane[plane];
          const int bw = overlap >> pd->subsampling_x;
          const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
          const int dst_stride = pd->dst.stride;
          uint8_t *const dst =
              &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
          const int tmp_stride = left_stride[plane];
          const uint8_t *const tmp =
              &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
          const uint8_t *const mask = av1_get_obmc_mask(bw);

#if CONFIG_AOM_HIGHBITDEPTH
          if (is_hbd)
            aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
                                       tmp_stride, mask, bh, bw, xd->bd);
          else
#endif  // CONFIG_AOM_HIGHBITDEPTH
            aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
                                tmp_stride, mask, bh, bw);
        }
      }
      i += mi_step;
    } while (i < mih);
  }
}

void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
#if CONFIG_EXT_INTER
  if (is_interintra_pred(mbmi)) {
    mbmi->ref_frame[1] = NONE_FRAME;
  } else if (has_second_ref(mbmi) &&
             is_masked_compound_type(mbmi->interinter_compound_data.type)) {
    mbmi->interinter_compound_data.type = COMPOUND_AVERAGE;
    mbmi->ref_frame[1] = NONE_FRAME;
  }
#endif  // CONFIG_EXT_INTER
  if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME;
  return;
}

void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int mi_row, int mi_col,
                                         uint8_t *tmp_buf[MAX_MB_PLANE],
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]) {
  const TileInfo *const tile = &xd->tile;
  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int i, j, mi_step, ref;
  const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
  int mb_to_right_edge_base = xd->mb_to_right_edge;
  const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
  int neighbor_count = 0;

  if (mi_row <= tile->mi_row_start) return;

  xd->mb_to_bottom_edge += xd->n8_h * 32;
  for (i = 0; i < ilimit; i += mi_step) {
    int mi_row_offset = -1;
    int mi_col_offset = i;
    int mi_x, mi_y, bw, bh;
    MODE_INFO *above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
    MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
    const BLOCK_SIZE a_bsize = above_mbmi->sb_type;
    MB_MODE_INFO backup_mbmi;

    mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);

    if (!is_neighbor_overlappable(above_mbmi)) continue;

    neighbor_count++;
    if (neighbor_count > neighbor_limit) break;

    backup_mbmi = *above_mbmi;
    modify_neighbor_predictor_for_obmc(above_mbmi);

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      struct macroblockd_plane *const pd = &xd->plane[j];
      setup_pred_plane(&pd->dst, tmp_buf[j], tmp_width[j], tmp_height[j],
                       tmp_stride[j], 0, i, NULL, pd->subsampling_x,
                       pd->subsampling_y);
    }
    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
      const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];

      xd->block_refs[ref] = ref_buf;
      if ((!av1_is_valid_scale(&ref_buf->sf)))
        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                           "Reference frame has invalid dimensions");
      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
                           &ref_buf->sf);
    }

    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
    xd->mb_to_right_edge =
        mb_to_right_edge_base + (xd->n8_w - i - mi_step) * 64;
    mi_x = (mi_col + i) << MI_SIZE_LOG2;
    mi_y = mi_row << MI_SIZE_LOG2;

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      const struct macroblockd_plane *pd = &xd->plane[j];
      bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
      bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
                  4);

#if CONFIG_WARPED_MOTION
      if (above_mbmi->motion_mode == WARPED_CAUSAL &&
          WARP_NEIGHBORS_WITH_OBMC) {
        assert_motion_mode_valid(WARPED_CAUSAL,
#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                 0, cm->global_motion,
#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                 above_mi);

        av1_warp_plane(&above_mbmi->wm_params[0],
#if CONFIG_AOM_HIGHBITDEPTH
                       xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
#endif  // CONFIG_AOM_HIGHBITDEPTH
                       pd->pre[0].buf0, pd->pre[0].width, pd->pre[0].height,
                       pd->pre[0].stride, pd->dst.buf,
                       (((mi_col + i) * MI_SIZE) >> pd->subsampling_x),
                       ((mi_row * MI_SIZE) >> pd->subsampling_y), bw, bh,
                       pd->dst.stride, pd->subsampling_x, pd->subsampling_y, 16,
                       16, 0);

      } else {
#endif  // CONFIG_WARPED_MOTION
        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
                               0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                               0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                               mi_x, mi_y);
#if CONFIG_WARPED_MOTION
      }
#endif  // CONFIG_WARPED_MOTION
    }
    *above_mbmi = backup_mbmi;
  }
  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
  xd->mb_to_right_edge = mb_to_right_edge_base;
  xd->mb_to_bottom_edge -= xd->n8_h * 32;
}

void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                        int mi_row, int mi_col,
                                        uint8_t *tmp_buf[MAX_MB_PLANE],
                                        int tmp_width[MAX_MB_PLANE],
                                        int tmp_height[MAX_MB_PLANE],
                                        int tmp_stride[MAX_MB_PLANE]) {
  const TileInfo *const tile = &xd->tile;
  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int i, j, mi_step, ref;
  const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
  int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
  const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
  int neighbor_count = 0;

  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start)) return;

  xd->mb_to_right_edge += xd->n8_w * 32;
  for (i = 0; i < ilimit; i += mi_step) {
    int mi_row_offset = i;
    int mi_col_offset = -1;
    int mi_x, mi_y, bw, bh;
    MODE_INFO *left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
    MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
    const BLOCK_SIZE l_bsize = left_mbmi->sb_type;
    MB_MODE_INFO backup_mbmi;

    mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);

    if (!is_neighbor_overlappable(left_mbmi)) continue;

    neighbor_count++;
    if (neighbor_count > neighbor_limit) break;

    backup_mbmi = *left_mbmi;
    modify_neighbor_predictor_for_obmc(left_mbmi);

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      struct macroblockd_plane *const pd = &xd->plane[j];
      setup_pred_plane(&pd->dst, tmp_buf[j], tmp_width[j], tmp_height[j],
                       tmp_stride[j], i, 0, NULL, pd->subsampling_x,
                       pd->subsampling_y);
    }
    for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
      const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];

      xd->block_refs[ref] = ref_buf;
      if ((!av1_is_valid_scale(&ref_buf->sf)))
        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                           "Reference frame has invalid dimensions");
      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
                           &ref_buf->sf);
    }

    xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8);
    xd->mb_to_bottom_edge =
        mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * 64;
    mi_x = mi_col << MI_SIZE_LOG2;
    mi_y = (mi_row + i) << MI_SIZE_LOG2;

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      const struct macroblockd_plane *pd = &xd->plane[j];
      bw = AOMMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
                  4);
      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;

#if CONFIG_WARPED_MOTION
      if (left_mbmi->motion_mode == WARPED_CAUSAL && WARP_NEIGHBORS_WITH_OBMC) {
        assert_motion_mode_valid(WARPED_CAUSAL,
#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                 0, cm->global_motion,
#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                 left_mi);

        av1_warp_plane(&left_mbmi->wm_params[0],
#if CONFIG_AOM_HIGHBITDEPTH
                       xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
#endif  // CONFIG_AOM_HIGHBITDEPTH
                       pd->pre[0].buf0, pd->pre[0].width, pd->pre[0].height,
                       pd->pre[0].stride, pd->dst.buf,
                       ((mi_col * MI_SIZE) >> pd->subsampling_x),
                       (((mi_row + i) * MI_SIZE) >> pd->subsampling_y), bw, bh,
                       pd->dst.stride, pd->subsampling_x, pd->subsampling_y, 16,
                       16, 0);

      } else {
#endif  // CONFIG_WARPED_MOTION
        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
                               0, 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                               0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                               mi_x, mi_y);
#if CONFIG_WARPED_MOTION
      }
#endif  // CONFIG_WARPED_MOTION
    }
    *left_mbmi = backup_mbmi;
  }
  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
  xd->mb_to_right_edge -= xd->n8_w * 32;
}

void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                        int mi_row, int mi_col) {
#if CONFIG_AOM_HIGHBITDEPTH
  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
#else
  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
#endif  // CONFIG_AOM_HIGHBITDEPTH
  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };

#if CONFIG_AOM_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    int len = sizeof(uint16_t);
    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
  } else {
#endif  // CONFIG_AOM_HIGHBITDEPTH
    dst_buf1[0] = tmp_buf1;
    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
    dst_buf2[0] = tmp_buf2;
    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
#if CONFIG_AOM_HIGHBITDEPTH
  }
#endif  // CONFIG_AOM_HIGHBITDEPTH
  av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                      dst_width1, dst_height1, dst_stride1);
  av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
                                     dst_width2, dst_height2, dst_stride2);
  av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
                                  dst_buf2, dst_stride2);
}

#if CONFIG_NCOBMC
void av1_build_prediction_by_bottom_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          int mi_row, int mi_col,
                                          uint8_t *tmp_buf[MAX_MB_PLANE],
                                          int tmp_width[MAX_MB_PLANE],
                                          int tmp_height[MAX_MB_PLANE],
                                          int tmp_stride[MAX_MB_PLANE]) {
  const TileInfo *const tile = &xd->tile;
  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int i, j, mi_step, ref;
  const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
  int mb_to_right_edge_base = xd->mb_to_right_edge;

  if (mi_row + xd->n8_h >= tile->mi_row_end ||
      (mi_row + xd->n8_h) % MI_SIZE == 0 || (mi_row + xd->n8_h) >= cm->mi_rows)
    return;
  assert(bsize >= BLOCK_8X8);

  xd->mb_to_top_edge -= xd->n8_h * 32;
  for (i = 0; i < ilimit; i += mi_step) {
    int mi_row_offset = xd->n8_h;
    int mi_col_offset = i;
    int mi_x, mi_y, bw, bh;
    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
    MB_MODE_INFO *mbmi = &mi->mbmi;
#if CONFIG_EXT_INTER
    MB_MODE_INFO backup_mbmi;
#endif  // CONFIG_EXT_INTER

    mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);

    if (!is_neighbor_overlappable(mbmi)) continue;

#if CONFIG_EXT_INTER
    backup_mbmi = *mbmi;
    modify_neighbor_predictor_for_obmc(mbmi);
#endif  // CONFIG_EXT_INTER

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      struct macroblockd_plane *const pd = &xd->plane[j];
      setup_pred_plane(&pd->dst, tmp_buf[j], tmp_width[j], tmp_height[j],
                       tmp_stride[j], (xd->n8_h >> 1), i, NULL,
                       pd->subsampling_x, pd->subsampling_y);
    }
    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
      const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];

      xd->block_refs[ref] = ref_buf;
      if ((!av1_is_valid_scale(&ref_buf->sf)))
        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                           "Reference frame has invalid dimensions");
      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + (xd->n8_h >> 1),
                           mi_col + i, &ref_buf->sf);
    }

    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
    xd->mb_to_right_edge =
        mb_to_right_edge_base + (xd->n8_w - i - mi_step) * 64;
    mi_x = (mi_col + i) << MI_SIZE_LOG2;
    mi_y = (mi_row << MI_SIZE_LOG2) + xd->n8_h * 4;

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      const struct macroblockd_plane *pd = &xd->plane[j];
      bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_x;
      bh = (num_4x4_blocks_high_lookup[bsize] << 1) >> pd->subsampling_y;

      if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
        const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
        const int have_vsplit = bp != PARTITION_HORZ;
        const int have_hsplit = bp != PARTITION_VERT;
        const int num_4x4_w = 2 >> (!have_vsplit);
        const int num_4x4_h = 2 >> (!have_hsplit);
        const int pw = 8 >> (have_vsplit + pd->subsampling_x);
        int x, y;

        for (y = 0; y < num_4x4_h; ++y)
          for (x = 0; x < num_4x4_w; ++x) {
            if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT) && y != 0)
              continue;

            build_inter_predictors(
                xd, j, mi_col_offset, mi_row_offset, y * 2 + x, bw, bh,
                (4 * x) >> pd->subsampling_x,
                xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0, pw, bh,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                mi_x, mi_y);
          }
      } else {
#if CONFIG_WARPED_MOTION
        if (mbmi->motion_mode == WARPED_CAUSAL && WARP_NEIGHBORS_WITH_OBMC) {
          assert_motion_mode_valid(WARPED_CAUSAL,
#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                   0, cm->global_motion,
#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                   mi);

          av1_warp_plane(&mbmi->wm_params[0],
#if CONFIG_AOM_HIGHBITDEPTH
                         xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
#endif  // CONFIG_AOM_HIGHBITDEPTH
                         pd->pre[0].buf0, pd->pre[0].width, pd->pre[0].height,
                         pd->pre[0].stride, pd->dst.buf,
                         (((mi_col + i) * MI_SIZE) >> pd->subsampling_x),
                         ((mi_row * MI_SIZE) >> pd->subsampling_y), bw, bh,
                         pd->dst.stride, pd->subsampling_x, pd->subsampling_y,
                         16, 16, 0);

        } else {
#endif  // CONFIG_WARPED_MOTION
          build_inter_predictors(
              xd, j, mi_col_offset, mi_row_offset, 0, bw, bh, 0,
              xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0, bw, bh,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
              0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
              mi_x, mi_y);
#if CONFIG_WARPED_MOTION
        }
#endif  // CONFIG_WARPED_MOTION
      }
    }
#if CONFIG_EXT_INTER
    *mbmi = backup_mbmi;
#endif  // CONFIG_EXT_INTER
  }
  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
  xd->mb_to_right_edge = mb_to_right_edge_base;
  xd->mb_to_top_edge += xd->n8_h * 32;
}

void av1_build_prediction_by_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int mi_row, int mi_col,
                                         uint8_t *tmp_buf[MAX_MB_PLANE],
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         const int tmp_stride[MAX_MB_PLANE]) {
  const TileInfo *const tile = &xd->tile;
  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int i, j, mi_step, ref;
  const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
  int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;

  if (mi_col + xd->n8_w >= tile->mi_col_end ||
      (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols)
    return;

  xd->mb_to_left_edge -= xd->n8_w * 32;
  for (i = 0; i < ilimit; i += mi_step) {
    int mi_row_offset = i;
    int mi_col_offset = xd->n8_w;
    int mi_x, mi_y, bw, bh;
    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
    MB_MODE_INFO *mbmi = &mi->mbmi;
#if CONFIG_EXT_INTER
    MB_MODE_INFO backup_mbmi;
#endif  // CONFIG_EXT_INTER

    mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);

    if (!is_neighbor_overlappable(mbmi)) continue;

#if CONFIG_EXT_INTER
    backup_mbmi = *mbmi;
    modify_neighbor_predictor_for_obmc(mbmi);
#endif  // CONFIG_EXT_INTER

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      struct macroblockd_plane *const pd = &xd->plane[j];
      setup_pred_plane(&pd->dst, tmp_buf[j], tmp_width[j], tmp_height[j],
                       tmp_stride[j], i, xd->n8_w >> 1, NULL, pd->subsampling_x,
                       pd->subsampling_y);
    }
    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
      const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];

      xd->block_refs[ref] = ref_buf;
      if ((!av1_is_valid_scale(&ref_buf->sf)))
        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                           "Reference frame has invalid dimensions");
      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i,
                           mi_col + (xd->n8_w >> 1), &ref_buf->sf);
    }

    xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8);
    xd->mb_to_bottom_edge =
        mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * 64;
    mi_x = (mi_col << MI_SIZE_LOG2) + xd->n8_w * 4;
    mi_y = (mi_row + i) << MI_SIZE_LOG2;

    for (j = 0; j < MAX_MB_PLANE; ++j) {
      const struct macroblockd_plane *pd = &xd->plane[j];
      bw = (num_4x4_blocks_wide_lookup[bsize] << 1) >> pd->subsampling_x;
      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;

      if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
        const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
        const int have_vsplit = bp != PARTITION_HORZ;
        const int have_hsplit = bp != PARTITION_VERT;
        const int num_4x4_w = 2 >> (!have_vsplit);
        const int num_4x4_h = 2 >> (!have_hsplit);
        const int ph = 8 >> (have_hsplit + pd->subsampling_y);
        int x, y;

        for (y = 0; y < num_4x4_h; ++y)
          for (x = 0; x < num_4x4_w; ++x) {
            if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT) && x != 0)
              continue;

            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
                                   y * 2 + x, bw, bh,
                                   xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0,
                                   (4 * y) >> pd->subsampling_y, bw, ph,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                                   0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                                   mi_x, mi_y);
          }
      } else {
#if CONFIG_WARPED_MOTION
        if (mbmi->motion_mode == WARPED_CAUSAL && WARP_NEIGHBORS_WITH_OBMC) {
          assert_motion_mode_valid(WARPED_CAUSAL,
#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                   0, cm->global_motion,
#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
                                   mi);

          av1_warp_plane(&mbmi->wm_params[0],
#if CONFIG_AOM_HIGHBITDEPTH
                         xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
#endif  // CONFIG_AOM_HIGHBITDEPTH
                         pd->pre[0].buf0, pd->pre[0].width, pd->pre[0].height,
                         pd->pre[0].stride, pd->dst.buf,
                         ((mi_col * MI_SIZE) >> pd->subsampling_x),
                         (((mi_row + i) * MI_SIZE) >> pd->subsampling_y), bw,
                         bh, pd->dst.stride, pd->subsampling_x,
                         pd->subsampling_y, 16, 16, 0);

        } else {
#endif  // CONFIG_WARPED_MOTION
          build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
                                 xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0, 0,
                                 bw, bh,
#if CONFIG_SUPERTX && CONFIG_EXT_INTER
                                 0, 0,
#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
                                 mi_x, mi_y);
#if CONFIG_WARPED_MOTION
        }
#endif  // CONFIG_WARPED_MOTION
      }
    }
#if CONFIG_EXT_INTER
    *mbmi = backup_mbmi;
#endif  // CONFIG_EXT_INTER
  }
  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
  xd->mb_to_left_edge += xd->n8_w * 32;
}

// This function combines motion compensated predictions that is generated by
// bottom/right neighboring blocks' inter predictors with prediction in dst
// buffer.
void av1_merge_dst_bottom_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int mi_row, int mi_col,
                                      uint8_t *bottom[MAX_MB_PLANE],
                                      const int bottom_stride[MAX_MB_PLANE],
                                      uint8_t *right[MAX_MB_PLANE],
                                      const int right_stride[MAX_MB_PLANE]) {
  const TileInfo *const tile = &xd->tile;
  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int plane, i, mi_step;
  const int bottom_available = mi_row + xd->n8_h < tile->mi_row_end &&
                               (mi_row + xd->n8_h) % MI_SIZE != 0 &&
                               (mi_row + xd->n8_h) < cm->mi_rows;
#if CONFIG_AOM_HIGHBITDEPTH
  int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif  // CONFIG_AOM_HIGHBITDEPTH

  // handle bottom row
  for (i = 0; bottom_available && i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
       i += mi_step) {
    int mi_row_offset = xd->n8_h;
    int mi_col_offset = i;
    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
    MB_MODE_INFO *mbmi = &mi->mbmi;
    int overlap;

    mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);

    if (!is_neighbor_overlappable(mbmi)) continue;

    overlap = num_4x4_blocks_high_lookup[bsize] << 1;

    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
      const struct macroblockd_plane *pd = &xd->plane[plane];
      const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
      const int bh = overlap >> pd->subsampling_y;
      const int dst_stride = pd->dst.stride;
      uint8_t *dst =
          &pd->dst.buf[((i * MI_SIZE) >> pd->subsampling_x) +
                       (((xd->n8_h * MI_SIZE - overlap) * dst_stride) >>
                        pd->subsampling_y)];
      const int tmp_stride = bottom_stride[plane];
      const uint8_t *const tmp =
          &bottom[plane][((i * MI_SIZE) >> pd->subsampling_x) +
                         (((xd->n8_h * MI_SIZE - overlap) * tmp_stride) >>
                          pd->subsampling_y)];
      const uint8_t *const mask = av1_get_obmc_mask_flipped(bh);

#if CONFIG_AOM_HIGHBITDEPTH
      if (is_hbd)
        aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
                                   tmp_stride, mask, bh, bw, xd->bd);
      else
#endif  // CONFIG_AOM_HIGHBITDEPTH
        aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
                            mask, bh, bw);
    }
  }  // each mi in the bottom row

  // handle right column
  if (mi_col + xd->n8_w >= tile->mi_col_end ||
      (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols)
    return;

  for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
    int mi_row_offset = i;
    int mi_col_offset = xd->n8_w;
    int overlap;
    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
    MB_MODE_INFO *mbmi = &mi->mbmi;

    mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);

    if (!is_neighbor_overlappable(mbmi)) continue;

    overlap = num_4x4_blocks_wide_lookup[bsize] << 1;

    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
      const struct macroblockd_plane *pd = &xd->plane[plane];
      const int bw = overlap >> pd->subsampling_x;
      const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
      const int dst_stride = pd->dst.stride;
      uint8_t *dst =
          &pd->dst.buf[((i * MI_SIZE * dst_stride) >> pd->subsampling_y) +
                       ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)];
      const int tmp_stride = right_stride[plane];
      const uint8_t *const tmp =
          &right[plane][((i * MI_SIZE * tmp_stride) >> pd->subsampling_y) +
                        ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)];
      const uint8_t *const mask = av1_get_obmc_mask_flipped(bw);

#if CONFIG_AOM_HIGHBITDEPTH
      if (is_hbd)
        aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
                                   tmp_stride, mask, bh, bw, xd->bd);
      else
#endif  // CONFIG_AOM_HIGHBITDEPTH
        aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
                            mask, bh, bw);
    }
  }  // each mi in the right column
}

// This function generates 4 sided obmc. (1) Prediction blocks generated by
// bottom and right motion vectors are calculated. (2) Combine them with the
// original prediction block (which should be pre-stored in xd->plane[].dst.buf
// before calling this function). The results is updated in xd->plane[].dst.buf
// (3) Call causal obmc prediction function, which will generate left and above
// preds, and then merge them and xd->plane[].dst.buf.
void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          int mi_row, int mi_col) {
#if CONFIG_AOM_HIGHBITDEPTH
  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
#else
  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
#endif  // CONFIG_AOM_HIGHBITDEPTH
  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };

#if CONFIG_AOM_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    int len = sizeof(uint16_t);
    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
  } else {
#endif  // CONFIG_AOM_HIGHBITDEPTH
    dst_buf1[0] = tmp_buf1;
    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
    dst_buf2[0] = tmp_buf2;
    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
#if CONFIG_AOM_HIGHBITDEPTH
  }
#endif  // CONFIG_AOM_HIGHBITDEPTH

  av1_build_prediction_by_bottom_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                       dst_width1, dst_height1, dst_stride1);
  av1_build_prediction_by_right_preds(cm, xd, mi_row, mi_col, dst_buf2,
                                      dst_width2, dst_height2, dst_stride2);
  av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
  av1_merge_dst_bottom_right_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                   dst_stride1, dst_buf2, dst_stride2);
  av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
  av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
  av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
}
#endif  // CONFIG_NCOBMC
#endif  // CONFIG_MOTION_VAR

#if CONFIG_EXT_INTER
/* clang-format off */
#if CONFIG_EXT_PARTITION
static const int ii_weights1d[MAX_SB_SIZE] = {
  102, 100, 97, 95, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 73, 71, 69, 68, 67,
  65,  64,  62, 61, 60, 59, 58, 57, 55, 54, 53, 52, 52, 51, 50, 49, 48, 47, 47,
  46,  45,  45, 44, 43, 43, 42, 41, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 36,
  36,  36,  35, 35, 35, 34, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 31,
  31,  31,  31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28,
  28,  28,  28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27,
  27,  27,  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
};
static int ii_size_scales[BLOCK_SIZES] = {
#if CONFIG_CB4X4
    32, 32, 32,
#endif
    32, 16, 16, 16, 8, 8, 8, 4,
    4,  4,  2,  2,  2, 1, 1, 1,
};
#else
static const int ii_weights1d[MAX_SB_SIZE] = {
  102, 100, 97, 95, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 73, 71,
  69,  68,  67, 65, 64, 62, 61, 60, 59, 58, 57, 55, 54, 53, 52, 52,
  51,  50,  49, 48, 47, 47, 46, 45, 45, 44, 43, 43, 42, 41, 41, 40,
  40,  39,  39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 35, 34, 34,
};
static int ii_size_scales[BLOCK_SIZES] = {
#if CONFIG_CB4X4
    16, 16, 16,
#endif
    16, 8, 8, 8, 4, 4, 4,
    2,  2, 2, 1, 1, 1,
};
/* clang-format on */
#endif  // CONFIG_EXT_PARTITION

static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
                               int wedge_index, int wedge_sign,
                               BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
                               uint8_t *comppred, int compstride,
                               const uint8_t *interpred, int interstride,
                               const uint8_t *intrapred, int intrastride) {
  const int bw = block_size_wide[plane_bsize];
  const int bh = block_size_high[plane_bsize];
  const int size_scale = ii_size_scales[plane_bsize];
  int i, j;

  if (use_wedge_interintra) {
    if (is_interintra_wedge_used(bsize)) {
      const uint8_t *mask =
          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
      aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
                         interpred, interstride, mask, block_size_wide[bsize],
                         bh, bw, subh, subw);
    }
    return;
  }

  switch (mode) {
    case II_V_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = ii_weights1d[i * size_scale];
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_H_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = ii_weights1d[j * size_scale];
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D63_PRED:
    case II_D117_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = (ii_weights1d[i * size_scale] * 3 +
                       ii_weights1d[j * size_scale]) >>
                      2;
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D207_PRED:
    case II_D153_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = (ii_weights1d[j * size_scale] * 3 +
                       ii_weights1d[i * size_scale]) >>
                      2;
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D135_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D45_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale =
              (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >>
              1;
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_TM_PRED:
    case II_DC_PRED:
    default:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          comppred[i * compstride + j] = AOM_BLEND_AVG(
              intrapred[i * intrastride + j], interpred[i * interstride + j]);
        }
      }
      break;
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
static void combine_interintra_highbd(
    INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
    int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
    uint8_t *comppred8, int compstride, const uint8_t *interpred8,
    int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
  const int bw = block_size_wide[plane_bsize];
  const int bh = block_size_high[plane_bsize];
  const int size_scale = ii_size_scales[plane_bsize];
  int i, j;

  uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
  const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
  const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);

  if (use_wedge_interintra) {
    if (is_interintra_wedge_used(bsize)) {
      const uint8_t *mask =
          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
      aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
                                interpred8, interstride, mask, bw, bh, bw, subh,
                                subw, bd);
    }
    return;
  }

  switch (mode) {
    case II_V_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = ii_weights1d[i * size_scale];
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_H_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = ii_weights1d[j * size_scale];
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D63_PRED:
    case II_D117_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = (ii_weights1d[i * size_scale] * 3 +
                       ii_weights1d[j * size_scale]) >>
                      2;
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D207_PRED:
    case II_D153_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = (ii_weights1d[j * size_scale] * 3 +
                       ii_weights1d[i * size_scale]) >>
                      2;
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D135_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_D45_PRED:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          int scale =
              (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >>
              1;
          comppred[i * compstride + j] =
              AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
                             interpred[i * interstride + j]);
        }
      }
      break;

    case II_TM_PRED:
    case II_DC_PRED:
    default:
      for (i = 0; i < bh; ++i) {
        for (j = 0; j < bw; ++j) {
          comppred[i * compstride + j] = AOM_BLEND_AVG(
              interpred[i * interstride + j], intrapred[i * intrastride + j]);
        }
      }
      break;
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH

void av1_build_intra_predictors_for_interintra(MACROBLOCKD *xd,
                                               BLOCK_SIZE bsize, int plane,
                                               BUFFER_SET *ctx, uint8_t *dst,
                                               int dst_stride) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
  PREDICTION_MODE mode =
      interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode];

  av1_predict_intra_block(xd, pd->width, pd->height, plane_bsize, mode,
                          ctx->plane[plane], ctx->stride[plane], dst,
                          dst_stride, 0, 0, plane);
}

void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                            const uint8_t *inter_pred, int inter_stride,
                            const uint8_t *intra_pred, int intra_stride) {
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
#if CONFIG_AOM_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    combine_interintra_highbd(
        xd->mi[0]->mbmi.interintra_mode, xd->mi[0]->mbmi.use_wedge_interintra,
        xd->mi[0]->mbmi.interintra_wedge_index,
        xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
        xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred,
        inter_stride, intra_pred, intra_stride, xd->bd);
    return;
  }
#endif  // CONFIG_AOM_HIGHBITDEPTH
  combine_interintra(xd->mi[0]->mbmi.interintra_mode,
                     xd->mi[0]->mbmi.use_wedge_interintra,
                     xd->mi[0]->mbmi.interintra_wedge_index,
                     xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
                     xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
                     inter_pred, inter_stride, intra_pred, intra_stride);
}

void av1_build_interintra_predictors_sby(MACROBLOCKD *xd, uint8_t *ypred,
                                         int ystride, BUFFER_SET *ctx,
                                         BLOCK_SIZE bsize) {
#if CONFIG_AOM_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
    av1_build_intra_predictors_for_interintra(
        xd, bsize, 0, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
    av1_combine_interintra(xd, bsize, 0, ypred, ystride,
                           CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
    return;
  }
#endif  // CONFIG_AOM_HIGHBITDEPTH
  {
    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
    av1_build_intra_predictors_for_interintra(xd, bsize, 0, ctx, intrapredictor,
                                              MAX_SB_SIZE);
    av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor,
                           MAX_SB_SIZE);
  }
}

void av1_build_interintra_predictors_sbc(MACROBLOCKD *xd, uint8_t *upred,
                                         int ustride, BUFFER_SET *ctx,
                                         int plane, BLOCK_SIZE bsize) {
#if CONFIG_AOM_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
    av1_build_intra_predictors_for_interintra(
        xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(uintrapredictor),
        MAX_SB_SIZE);
    av1_combine_interintra(xd, bsize, plane, upred, ustride,
                           CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
    return;
  }
#endif  // CONFIG_AOM_HIGHBITDEPTH
  {
    DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
    av1_build_intra_predictors_for_interintra(xd, bsize, plane, ctx,
                                              uintrapredictor, MAX_SB_SIZE);
    av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor,
                           MAX_SB_SIZE);
  }
}

void av1_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred,
                                          uint8_t *vpred, int ustride,
                                          int vstride, BUFFER_SET *ctx,
                                          BLOCK_SIZE bsize) {
  av1_build_interintra_predictors_sbc(xd, upred, ustride, ctx, 1, bsize);
  av1_build_interintra_predictors_sbc(xd, vpred, vstride, ctx, 2, bsize);
}

void av1_build_interintra_predictors(MACROBLOCKD *xd, uint8_t *ypred,
                                     uint8_t *upred, uint8_t *vpred,
                                     int ystride, int ustride, int vstride,
                                     BUFFER_SET *ctx, BLOCK_SIZE bsize) {
  av1_build_interintra_predictors_sby(xd, ypred, ystride, ctx, bsize);
  av1_build_interintra_predictors_sbuv(xd, upred, vpred, ustride, vstride, ctx,
                                       bsize);
}

// Builds the inter-predictor for the single ref case
// for use in the encoder to search the wedges efficiently.
static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
                                              int block, int bw, int bh, int x,
                                              int y, int w, int h, int mi_x,
                                              int mi_y, int ref,
                                              uint8_t *const ext_dst,
                                              int ext_dst_stride) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const MODE_INFO *mi = xd->mi[0];

  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
  struct buf_2d *const pre_buf = &pd->pre[ref];
#if CONFIG_AOM_HIGHBITDEPTH
  uint8_t *const dst =
      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? CONVERT_TO_BYTEPTR(ext_dst)
                                                   : ext_dst) +
      ext_dst_stride * y + x;
#else
  uint8_t *const dst = ext_dst + ext_dst_stride * y + x;
#endif
  const MV mv = mi->mbmi.sb_type < BLOCK_8X8
                    ? average_split_mvs(pd, mi, ref, block)
                    : mi->mbmi.mv[ref].as_mv;

  // TODO(jkoleszar): This clamping is done in the incorrect place for the
  // scaling case. It needs to be done on the scaled MV, not the pre-scaling
  // MV. Note however that it performs the subsampling aware scaling so
  // that the result is always q4.
  // mv_precision precision is MV_PRECISION_Q4.
  const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
                                             pd->subsampling_y);

  uint8_t *pre;
  MV32 scaled_mv;
  int xs, ys, subpel_x, subpel_y;
  const int is_scaled = av1_is_scaled(sf);
  ConvolveParams conv_params = get_conv_params(0, plane);
#if CONFIG_GLOBAL_MOTION
  WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
  const int is_global = is_global_mv_block(mi, block, wm->wmtype);
#endif  // CONFIG_GLOBAL_MOTION

  if (is_scaled) {
    pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
    scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
    xs = sf->x_step_q4;
    ys = sf->y_step_q4;
  } else {
    pre = pre_buf->buf + (y * pre_buf->stride + x);
    scaled_mv.row = mv_q4.row;
    scaled_mv.col = mv_q4.col;
    xs = ys = 16;
  }

  subpel_x = scaled_mv.col & SUBPEL_MASK;
  subpel_y = scaled_mv.row & SUBPEL_MASK;
  pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
         (scaled_mv.col >> SUBPEL_BITS);

  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, subpel_x,
                           subpel_y, sf, w, h, &conv_params,
                           mi->mbmi.interp_filter,
#if CONFIG_GLOBAL_MOTION
                           is_global, (mi_x >> pd->subsampling_x) + x,
                           (mi_y >> pd->subsampling_y) + y, plane, ref,
#if CONFIG_MOTION_VAR
                           0, 0,
#endif
#endif  // CONFIG_GLOBAL_MOTION
                           xs, ys, xd);
}

void av1_build_inter_predictors_for_planes_single_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]) {
  int plane;
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
  for (plane = plane_from; plane <= plane_to; ++plane) {
    const BLOCK_SIZE plane_bsize =
        get_plane_block_size(bsize, &xd->plane[plane]);
    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
    const int bw = block_size_wide[plane_bsize];
    const int bh = block_size_high[plane_bsize];

    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
      int x, y;
      assert(bsize == BLOCK_8X8);
      for (y = 0; y < num_4x4_h; ++y)
        for (x = 0; x < num_4x4_w; ++x)
          build_inter_predictors_single_buf(
              xd, plane, y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4, mi_x, mi_y, ref,
              ext_dst[plane], ext_dst_stride[plane]);
    } else {
      build_inter_predictors_single_buf(xd, plane, 0, bw, bh, 0, 0, bw, bh,
                                        mi_x, mi_y, ref, ext_dst[plane],
                                        ext_dst_stride[plane]);
    }
  }
}

static void build_wedge_inter_predictor_from_buf(
    MACROBLOCKD *xd, int plane, int x, int y, int w, int h,
#if CONFIG_SUPERTX
    int wedge_offset_x, int wedge_offset_y,
#endif  // CONFIG_SUPERTX
    uint8_t *ext_dst0, int ext_dst_stride0, uint8_t *ext_dst1,
    int ext_dst_stride1) {
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  const int is_compound = has_second_ref(mbmi);
  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
  struct buf_2d *const dst_buf = &pd->dst;
  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
  INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_compound_data;

  if (is_compound &&
      is_masked_compound_type(mbmi->interinter_compound_data.type)) {
#if CONFIG_COMPOUND_SEGMENT
#if CONFIG_AOM_HIGHBITDEPTH
    if (!plane && comp_data->type == COMPOUND_SEG) {
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
        build_compound_seg_mask_highbd(
            comp_data->seg_mask, comp_data->mask_type,
            CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, mbmi->sb_type, h, w,
            xd->bd);
      else
        build_compound_seg_mask(comp_data->seg_mask, comp_data->mask_type,
                                ext_dst0, ext_dst_stride0, ext_dst1,
                                ext_dst_stride1, mbmi->sb_type, h, w);
    }
#else
    if (!plane && comp_data->type == COMPOUND_SEG)
      build_compound_seg_mask(comp_data->seg_mask, comp_data->mask_type,
                              ext_dst0, ext_dst_stride0, ext_dst1,
                              ext_dst_stride1, mbmi->sb_type, h, w);
#endif  // CONFIG_AOM_HIGHBITDEPTH
#endif  // CONFIG_COMPOUND_SEGMENT

#if CONFIG_SUPERTX
#if CONFIG_AOM_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
      build_masked_compound_wedge_extend_highbd(
          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
          mbmi->sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
    else
#endif  // CONFIG_AOM_HIGHBITDEPTH
      build_masked_compound_wedge_extend(
          dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1,
          ext_dst_stride1, comp_data, mbmi->sb_type, wedge_offset_x,
          wedge_offset_y, h, w);
#else
#if CONFIG_AOM_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
      build_masked_compound_highbd(
          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
          mbmi->sb_type, h, w, xd->bd);
    else
#endif  // CONFIG_AOM_HIGHBITDEPTH
      build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
                            h, w);
#endif  // CONFIG_SUPERTX
  } else {
#if CONFIG_AOM_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
                               xd->bd);
    else
#endif  // CONFIG_AOM_HIGHBITDEPTH
      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
                        0, NULL, 0, w, h);
  }
}

void av1_build_wedge_inter_predictor_from_buf(
    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to,
#if CONFIG_SUPERTX
    int wedge_offset_x, int wedge_offset_y,
#endif  // CONFIG_SUPERTX
    uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3],
    int ext_dst_stride1[3]) {
  int plane;
  for (plane = plane_from; plane <= plane_to; ++plane) {
    const BLOCK_SIZE plane_bsize =
        get_plane_block_size(bsize, &xd->plane[plane]);
    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];

    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
      int x, y;
      assert(bsize == BLOCK_8X8);
      for (y = 0; y < num_4x4_h; ++y)
        for (x = 0; x < num_4x4_w; ++x)
          build_wedge_inter_predictor_from_buf(
              xd, plane, 4 * x, 4 * y, 4, 4,
#if CONFIG_SUPERTX
              wedge_offset_x, wedge_offset_y,
#endif  // CONFIG_SUPERTX
              ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane],
              ext_dst_stride1[plane]);
    } else {
      const int bw = block_size_wide[plane_bsize];
      const int bh = block_size_high[plane_bsize];
      build_wedge_inter_predictor_from_buf(
          xd, plane, 0, 0, bw, bh,
#if CONFIG_SUPERTX
          wedge_offset_x, wedge_offset_y,
#endif  // CONFIG_SUPERTX
          ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane],
          ext_dst_stride1[plane]);
    }
  }
}
#endif  // CONFIG_EXT_INTER
